Click to show code
print('hello')
#> [1] "hello"The following machine learning project focuses on…
print('hello')
#> [1] "hello"# Python code
import numpy as np
print(np.mean([10, 20, 30, 40, 50]))
#> 30.0properties <- read.csv(file.path(here(),"data/properties.csv"))
# show 1000 first rows of properties using reactable
reactable(head(properties, 1000))
# Create a tibble with cantons and observations
observations_table <- tibble(
Canton = c("Vaud", "Bern", "Lucerne", "Zurich", "Uri", "Schwyz",
"Obwalden", "Nidwalden", "Glarus", "St. Gallen", "Grisons",
"Aargau", "Thurgau", "Ticino", "Valais", "Neuchatel",
"Geneva", "Jura", "Zug", "Fribourg", "Solothurn",
"Basel-Stadt", "Basel-Landschaft", "Schaffhausen",
"Appenzell-Ausser-Rhoden", "Appenzell-Inner-Rhoden", "Total"),
Observations = c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12, sum(c(3232, 1553, 376, 1191, 71, 93, 29, 51, 55, 757, 405,
1481, 553, 4230, 3601, 513, 629, 329, 69, 1242, 590,
149, 705, 118, 102, 12)))
)
# Display the table using kable and kableExtra
observations_table %>%
kbl(caption = "Number of Observations by Canton") %>%
kable_styling(position = "center", bootstrap_options = c("striped", "bordered", "hover")) %>%
add_header_above(c(" " = 1, "Observations" = 1)) # Adds headers spanning columns| Canton | Observations |
|---|---|
| Vaud | 3232 |
| Bern | 1553 |
| Lucerne | 376 |
| Zurich | 1191 |
| Uri | 71 |
| Schwyz | 93 |
| Obwalden | 29 |
| Nidwalden | 51 |
| Glarus | 55 |
| St. Gallen | 757 |
| Grisons | 405 |
| Aargau | 1481 |
| Thurgau | 553 |
| Ticino | 4230 |
| Valais | 3601 |
| Neuchatel | 513 |
| Geneva | 629 |
| Jura | 329 |
| Zug | 69 |
| Fribourg | 1242 |
| Solothurn | 590 |
| Basel-Stadt | 149 |
| Basel-Landschaft | 705 |
| Schaffhausen | 118 |
| Appenzell-Ausser-Rhoden | 102 |
| Appenzell-Inner-Rhoden | 12 |
| Total | 22136 |
# Identify values causing the issue
problematic_values <- properties$number_of_rooms[is.na(as.numeric(properties$number_of_rooms))]
#> Warning: NAs introduced by coercion
# Replace non-numeric values with NA
properties$number_of_rooms <- as.numeric(gsub("[^0-9.]", "", properties$number_of_rooms))
# Remove non-numeric characters and convert to numeric
properties$price <- as.numeric(gsub("[^0-9]", "", properties$price))
# Subset the dataset to exclude rows with price < 20000
properties <- properties[properties$price >= 20000, ]
# Subset the dataset to exclude rows with numbers of rooms < 25
properties <- properties[properties$number_of_rooms <25, ]
# Replace incomplete addresses
properties$address <- gsub("^\\W*[.,0-]\\W*", "", properties$address)
properties_filtered <- na.omit(properties)
properties_filtered$year_category <- substr(properties_filtered$year_category, 1, 9)
# Assuming 'year_category' is a column in the 'properties' dataset
properties_filtered$year_category <- as.factor(properties_filtered$year_category)
# Preprocess the number_of_rooms column
properties_filtered$number_of_rooms <- as.character(properties_filtered$number_of_rooms)
properties_filtered$number_of_rooms <- gsub("\\D", "", properties_filtered$number_of_rooms) # Remove non-numeric characters
properties_filtered$number_of_rooms <- as.numeric(properties_filtered$number_of_rooms) # Convert to numeric
properties_filtered$number_of_rooms <- trunc(properties_filtered$number_of_rooms) # Truncate non-integer values
# remove m^2 from column 'square_meters'
properties_filtered$square_meters <- as.numeric(gsub("\\D", "", properties_filtered$square_meters))
# print how many NA observations left in square_meters
print(sum(is.na(properties_filtered$square_meters)))
#> [1] 988
# remove NA
properties_filtered <- properties_filtered[!is.na(properties_filtered$square_meters),]
# add majuscule to canton
properties_filtered$canton <- tools::toTitleCase(properties_filtered$canton)
# show 100 first row of cleaned dataset using reactable
reactable(head(properties_filtered, 100))
#filter properties_filtered to contains only 'price', 'number_of_rooms', 'square_meters'
properties_summary <- properties_filtered[, c('price', 'number_of_rooms', 'square_meters')]
#summary statistics
summary(properties_summary)
#> price number_of_rooms square_meters
#> Min. : 25000 Min. : 1.0 Min. : 1
#> 1st Qu.: 690000 1st Qu.: 35.0 1st Qu.: 99
#> Median : 992340 Median : 45.0 Median : 136
#> Mean : 1349710 Mean : 41.1 Mean : 159
#> 3rd Qu.: 1550000 3rd Qu.: 55.0 3rd Qu.: 190
#> Max. :26149500 Max. :185.0 Max. :2000
# Data
data <- data.frame(
price = c(25000, 690000, 992340, 1348429, 1550000, 26149500),
number_of_rooms = c(1.0, 35.0, 45.0, 41.1, 55.0, 185.0),
square_meters = c(1, 98, 136, 159, 190, 2000)
)
# Summary statistics
summary_stats <- summary(data)
# Summary statistics
summary_stats <- cbind(
Min = apply(data, 2, min),
Q1 = apply(data, 2, quantile, probs = 0.25),
Median = apply(data, 2, median),
Mean = apply(data, 2, mean),
Q3 = apply(data, 2, quantile, probs = 0.75),
Max = apply(data, 2, max)
)
# Create table
table <- kbl(summary_stats, align = rep('c', 6), caption = "Summary statistics for the dataset") %>%
kable_styling(position = "center", bootstrap_options = c("striped", "hover", "condensed", "responsive"))
table| Min | Q1 | Median | Mean | Q3 | Max | |
|---|---|---|---|---|---|---|
| price | 25000 | 765585.0 | 1170385 | 5.13e+06 | 1.50e+06 | 26149500 |
| number_of_rooms | 1 | 36.5 | 43 | 6.04e+01 | 5.25e+01 | 185 |
| square_meters | 1 | 107.5 | 148 | 4.31e+02 | 1.82e+02 | 2000 |
The dataset described is the “Official Index of Localities” (Répertoire officiel des localités) provided by the Swiss Federal Office of Topography (swisstopo). It contains comprehensive information about all localities in Switzerland and the Principality of Liechtenstein, including their names, postal codes, and perimeters.
This dataset is regularly updated on a monthly basis, incorporating changes reported by cantonal authorities and Swiss Post. It serves various purposes, including spatial analysis, integration with other geographic datasets, usage as a geolocated background in GIS (Geographic Information Systems) and CAD (Computer-Aided Design) systems, statistical analysis, and as a reference dataset for information systems.
Updates and release notes for the dataset are provided periodically, detailing changes and improvements made over time. The Swiss Federal Office of Topography manages and distributes this dataset as part of its responsibilities in collecting and providing official geospatial data for Switzerland.
Source - swisstopo
df <- properties_filtered
#the address column is like : '1844 Villeneuve VD' and has zip code number in it
#taking out the zip code number and creating a new column 'zip_code'
#the way to identify the zip code is to identify numbers that are 4 digits long
df$zip_code <- as.numeric(gsub("\\D", "", df$address))
#removing the first two number of zip code has more than 4 number
df$zip_code <- ifelse(df$zip_code > 9999, df$zip_code %% 10000, df$zip_code)#read .csv AMTOVZ_CSV_LV95
amto <- read.csv(file.path(here(),"data/AMTOVZ_CSV_WGS84.csv"), sep = ";")
#creating a new dataframe with 'Ortschaftsname' as 'City'Place_name', 'PLZ' as 'zip_code', 'KantonskÃ.rzel' as 'Canton_code', 'E' as 'lon' and 'N' as 'lat'
amto_df <- amto[, c('Gemeindename', 'PLZ', 'Kantonskürzel', 'E', 'N')]
#renaming the columns
colnames(amto_df) <- c('Community', 'zip_code', 'Canton_code', 'lon', 'lat')
#remove duplicates of zip code
amto_df <- amto_df[!duplicated(amto_df$zip_code),]
#add the variable of amto_df to the df if the zip code matches
df <- merge(df, amto_df, by = "zip_code", all.x = TRUE)
#check if there are nan in city
df[is.na(df$Community),]
#> zip_code price number_of_rooms square_meters
#> 1 25 2200000 65 165
#> 2 25 2200000 10 263
#> 3 26 655000 35 66
#> 4 26 1995000 75 180
#> 5 322 870000 25 59
#> 6 322 880000 25 55
#> 7 322 975000 35 56
#> 230 1014 1510000 55 146
#> 1137 1200 16092000 7 400
#> 1138 1200 3285450 5 230
#> 1139 1200 679000 55 142
#> 5478 1919 785000 35 103
#> 5479 1919 1908000 65 210
#> 5480 1919 2558620 55 270
#> 5481 1919 1065000 45 130
#> 7621 2500 1100000 5 154
#> 7622 2500 420000 45 115
#> 7623 2500 885500 55 130
#> 7624 2500 872500 45 144
#> 7625 2500 872500 45 138
#> 7626 2500 887500 55 130
#> 7627 2500 870500 45 125
#> 7628 2500 892500 45 144
#> 7629 2500 885500 55 130
#> 7630 2500 1050000 45 121
#> 7631 2500 877500 45 138
#> 7632 2500 887500 45 144
#> 7633 2500 1450000 55 198
#> 8325 3000 820000 55 165
#> 8326 3000 1140000 35 115
#> 8327 3000 1090000 35 115
#> 8328 3000 1090000 55 193
#> 8329 3000 920000 45 157
#> 8330 3000 1090000 55 193
#> 8331 3000 1590000 55 330
#> 8332 3000 720000 35 102
#> 8333 3000 920000 45 157
#> 10434 4000 180000 3 70
#> 10435 4000 975000 45 125
#> 10436 4000 2100000 65 360
#> 12358 5201 725000 35 95
#> 13212 6000 695000 45 133
#> 13964 6511 440000 2 64
#> 14240 6547 15000000 75 220
#> 14558 6602 2800000 75 242
#> 14559 6602 2800000 65 250
#> 14560 6602 270000 15 28
#> 14561 6602 450000 35 75
#> 14562 6604 1990000 45 220
#> 14563 6604 2668590 55 290
#> 14564 6604 760000 35 78
#> 16575 6901 3660930 45 290
#> 16576 6901 3660930 45 290
#> 16577 6903 790000 35 105
#> 16578 6907 995000 45 114
#> 16579 6907 995000 45 114
#> 16580 6911 469350 55 140
#> 16581 6911 610000 35 103
#> 16582 6911 660000 75 200
#> 16583 6911 737550 45 82
#> 17892 7133 2266290 55 160
#> 17901 7135 2690000 85 236
#> 18161 8000 2100000 45 152
#> 18162 8000 1650000 45 142
#> 18163 8000 925000 35 102
#> 18164 8000 1650000 45 142
#> 18165 8000 1150000 45 128
#> 18166 8000 1450000 55 143
#> 18167 8000 1990000 55 200
#> 18168 8000 1990000 55 200
#> 18169 8000 975000 45 122
#> 18170 8000 2495000 55 482
#> 18650 8238 245000 2 49
#> 19074 8423 2110000 65 204
#> 19075 8423 2190000 55 167
#> 20288 9241 545000 45 100
#> 20289 9241 730840 55 130
#> address
#> 1 1000 Lausanne 25
#> 2 1000 Lausanne 25
#> 3 1000 Lausanne 26
#> 4 Lausanne 26, 1000 Lausanne 26
#> 5 Via Cuolm Liung 30d, 7032 Laax GR 2
#> 6 7032 Laax GR 2
#> 7 Via Murschetg 29, 7032 Laax GR 2
#> 230 1014 Lausanne
#> 1137 1200 Genève
#> 1138 1200 Genève
#> 1139 Chemin des pralets, 74100 Etrembières, 1200 Genève
#> 5478 1919 Martigny
#> 5479 1919 Martigny
#> 5480 1919 Martigny
#> 5481 1919 Martigny
#> 7621 2500 Biel/Bienne
#> 7622 2500 Biel/Bienne
#> 7623 2500 Biel/Bienne
#> 7624 2500 Biel/Bienne
#> 7625 2500 Biel/Bienne
#> 7626 2500 Biel/Bienne
#> 7627 2500 Biel/Bienne
#> 7628 2500 Biel/Bienne
#> 7629 2500 Biel/Bienne
#> 7630 Hohlenweg 11b, 2500 Biel/Bienne
#> 7631 2500 Biel/Bienne
#> 7632 2500 Biel/Bienne
#> 7633 2500 Bienne
#> 8325 3000 Bern
#> 8326 3000 Bern
#> 8327 3000 Bern
#> 8328 3000 Bern
#> 8329 3000 Bern
#> 8330 3000 Bern
#> 8331 3000 Bern
#> 8332 3000 Bern
#> 8333 3000 Bern
#> 10434 Lörrach Brombach Steinsack 6, 4000 Basel
#> 10435 4000 Basel
#> 10436 4000 Basel
#> 12358 5201 Brugg AG
#> 13212 in TRIENGEN, ca. 20 min. bei Luzern, 6000 Luzern
#> 13964 6511 Cadenazzo
#> 14240 Augio 1F, 6547 Augio
#> 14558 6602 Muralto
#> 14559 6602 Muralto
#> 14560 6602 Muralto
#> 14561 Via Bacilieri 2, 6602 Muralto
#> 14562 6604 Solduno
#> 14563 6604 Solduno
#> 14564 6604 Locarno
#> 16575 6901 Lugano
#> 16576 6901 Lugano
#> 16577 6903 Lugano
#> 16578 6907 MASSAGNO
#> 16579 6907 MASSAGNO
#> 16580 6911 Campione d'Italia
#> 16581 6911 Campione d'Italia
#> 16582 6911 Campione d'Italia
#> 16583 6911 Campione d'Italia
#> 17892 Inder Platenga 34, 7133 Obersaxen
#> 17901 7135 Fideris
#> 18161 8000 Zürich
#> 18162 8000 Zürich
#> 18163 8000 Zürich
#> 18164 8000 Zürich
#> 18165 8000 Zürich
#> 18166 8000 Zürich
#> 18167 8000 Zürich
#> 18168 8000 Zürich
#> 18169 8000 Zürich
#> 18170 8000 Zürich
#> 18650 Stemmerstrasse 14, 8238 Büsingen am Hochrhein
#> 19074 Chüngstrasse 60, 8423 Embrach
#> 19075 Chüngstrasse 48, 8423 Embrach
#> 20288 9241 Kradolf
#> 20289 9241 Kradolf
#> canton property_type floor year_category Community
#> 1 Vaud Villa 2006-2010 <NA>
#> 2 Vaud Single house 1919-1945 <NA>
#> 3 Vaud Apartment noteg 2016-2024 <NA>
#> 4 Vaud Villa 1961-1970 <NA>
#> 5 Grisons Apartment eg 2016-2024 <NA>
#> 6 Grisons Apartment noteg 2016-2024 <NA>
#> 7 Grisons Apartment noteg 2011-2015 <NA>
#> 230 Vaud Apartment eg 2011-2015 <NA>
#> 1137 Geneva Single house 2011-2015 <NA>
#> 1138 Geneva Bifamiliar house 1981-1990 <NA>
#> 1139 Geneva Bifamiliar house 2016-2024 <NA>
#> 5478 Valais Apartment noteg 2016-2024 <NA>
#> 5479 Valais Apartment noteg 2016-2024 <NA>
#> 5480 Valais Attic flat noteg 2016-2024 <NA>
#> 5481 Valais Apartment noteg 2016-2024 <NA>
#> 7621 Bern Single house 2001-2005 <NA>
#> 7622 Bern Apartment noteg 1971-1980 <NA>
#> 7623 Bern Villa 2016-2024 <NA>
#> 7624 Bern Villa 2016-2024 <NA>
#> 7625 Bern Single house 2016-2024 <NA>
#> 7626 Bern Single house 2016-2024 <NA>
#> 7627 Bern Single house 2016-2024 <NA>
#> 7628 Bern Single house 2016-2024 <NA>
#> 7629 Bern Single house 2016-2024 <NA>
#> 7630 Bern Single house 2001-2005 <NA>
#> 7631 Bern Single house 2016-2024 <NA>
#> 7632 Bern Single house 2016-2024 <NA>
#> 7633 Bern Single house 2016-2024 <NA>
#> 8325 Bern Apartment noteg 2016-2024 <NA>
#> 8326 Bern Apartment eg 2016-2024 <NA>
#> 8327 Bern Apartment eg 2016-2024 <NA>
#> 8328 Bern Roof flat noteg 2016-2024 <NA>
#> 8329 Bern Apartment noteg 2016-2024 <NA>
#> 8330 Bern Apartment noteg 2016-2024 <NA>
#> 8331 Bern Apartment noteg 1991-2000 <NA>
#> 8332 Bern Apartment eg 2016-2024 <NA>
#> 8333 Bern Duplex noteg 2016-2024 <NA>
#> 10434 Basel-Stadt Single house 1961-1970 <NA>
#> 10435 Basel-Stadt Single house 2016-2024 <NA>
#> 10436 Basel-Stadt Villa 2016-2024 <NA>
#> 12358 Aargau Apartment noteg 2016-2024 <NA>
#> 13212 Lucerne Apartment noteg 1991-2000 <NA>
#> 13964 Ticino Apartment noteg 2016-2024 <NA>
#> 14240 Grisons Single house 2016-2024 <NA>
#> 14558 Ticino Single house 1981-1990 <NA>
#> 14559 Ticino Single house 1981-1990 <NA>
#> 14560 Ticino Apartment eg 1961-1970 <NA>
#> 14561 Ticino Apartment noteg 1946-1960 <NA>
#> 14562 Ticino Attic flat noteg 2011-2015 <NA>
#> 14563 Ticino Apartment noteg 2011-2015 <NA>
#> 14564 Ticino Apartment noteg 2011-2015 <NA>
#> 16575 Ticino Attic flat noteg 2011-2015 <NA>
#> 16576 Ticino Apartment noteg 2011-2015 <NA>
#> 16577 Ticino Apartment noteg 2006-2010 <NA>
#> 16578 Ticino Apartment noteg 2016-2024 <NA>
#> 16579 Ticino Apartment noteg 2016-2024 <NA>
#> 16580 Ticino Apartment noteg 1946-1960 <NA>
#> 16581 Ticino Apartment eg 1946-1960 <NA>
#> 16582 Ticino Single house 1971-1980 <NA>
#> 16583 Ticino Apartment noteg 1991-2000 <NA>
#> 17892 Grisons Single house 2006-2010 <NA>
#> 17901 Grisons Single house 0-1919 <NA>
#> 18161 Zurich Apartment noteg 2016-2024 <NA>
#> 18162 Zurich Attic flat noteg 2016-2024 <NA>
#> 18163 Zurich Apartment noteg 2016-2024 <NA>
#> 18164 Zurich Apartment noteg 2016-2024 <NA>
#> 18165 Zurich Apartment noteg 2016-2024 <NA>
#> 18166 Zurich Apartment eg 2016-2024 <NA>
#> 18167 Zurich Apartment noteg 2006-2010 <NA>
#> 18168 Zurich Attic flat noteg 2006-2010 <NA>
#> 18169 Zurich Single house 2016-2024 <NA>
#> 18170 Zurich Apartment noteg 0-1919 <NA>
#> 18650 Schaffhausen Apartment noteg 1961-1970 <NA>
#> 19074 Zurich Bifamiliar house 2016-2024 <NA>
#> 19075 Zurich Single house 2016-2024 <NA>
#> 20288 Thurgau Apartment noteg 1991-2000 <NA>
#> 20289 Thurgau Apartment noteg 1991-2000 <NA>
#> Canton_code lon lat
#> 1 <NA> NA NA
#> 2 <NA> NA NA
#> 3 <NA> NA NA
#> 4 <NA> NA NA
#> 5 <NA> NA NA
#> 6 <NA> NA NA
#> 7 <NA> NA NA
#> 230 <NA> NA NA
#> 1137 <NA> NA NA
#> 1138 <NA> NA NA
#> 1139 <NA> NA NA
#> 5478 <NA> NA NA
#> 5479 <NA> NA NA
#> 5480 <NA> NA NA
#> 5481 <NA> NA NA
#> 7621 <NA> NA NA
#> 7622 <NA> NA NA
#> 7623 <NA> NA NA
#> 7624 <NA> NA NA
#> 7625 <NA> NA NA
#> 7626 <NA> NA NA
#> 7627 <NA> NA NA
#> 7628 <NA> NA NA
#> 7629 <NA> NA NA
#> 7630 <NA> NA NA
#> 7631 <NA> NA NA
#> 7632 <NA> NA NA
#> 7633 <NA> NA NA
#> 8325 <NA> NA NA
#> 8326 <NA> NA NA
#> 8327 <NA> NA NA
#> 8328 <NA> NA NA
#> 8329 <NA> NA NA
#> 8330 <NA> NA NA
#> 8331 <NA> NA NA
#> 8332 <NA> NA NA
#> 8333 <NA> NA NA
#> 10434 <NA> NA NA
#> 10435 <NA> NA NA
#> 10436 <NA> NA NA
#> 12358 <NA> NA NA
#> 13212 <NA> NA NA
#> 13964 <NA> NA NA
#> 14240 <NA> NA NA
#> 14558 <NA> NA NA
#> 14559 <NA> NA NA
#> 14560 <NA> NA NA
#> 14561 <NA> NA NA
#> 14562 <NA> NA NA
#> 14563 <NA> NA NA
#> 14564 <NA> NA NA
#> 16575 <NA> NA NA
#> 16576 <NA> NA NA
#> 16577 <NA> NA NA
#> 16578 <NA> NA NA
#> 16579 <NA> NA NA
#> 16580 <NA> NA NA
#> 16581 <NA> NA NA
#> 16582 <NA> NA NA
#> 16583 <NA> NA NA
#> 17892 <NA> NA NA
#> 17901 <NA> NA NA
#> 18161 <NA> NA NA
#> 18162 <NA> NA NA
#> 18163 <NA> NA NA
#> 18164 <NA> NA NA
#> 18165 <NA> NA NA
#> 18166 <NA> NA NA
#> 18167 <NA> NA NA
#> 18168 <NA> NA NA
#> 18169 <NA> NA NA
#> 18170 <NA> NA NA
#> 18650 <NA> NA NA
#> 19074 <NA> NA NA
#> 19075 <NA> NA NA
#> 20288 <NA> NA NA
#> 20289 <NA> NA NAWe have 77 NAN, where
Removed them ::: {.cell layout-align=“center”}
#remove the rows with nan in city
properties_filtered <- df[!is.na(df$Community),]
reactable(head(properties_filtered, 100)):::
# read csv
impots <- read.csv(file.path(here(),"data/estv_income_rates.csv"), sep = ",", header = TRUE, stringsAsFactors = FALSE)
# Remove 1st row
impots <- impots[-1, ]
# Remove 3rd column
impots <- impots[, -3]
# Combine text for columns 4-8
impots[1, 4:8] <- "Impôt sur le revenu"
# Combine text for columns 9-13
impots[1, 9:13] <- "Impôt sur la fortune"
# Combine text for columns 14-16
impots[1, 14:16] <- "Impôt sur le bénéfice"
# Combine text for columns 17-19
impots[1, 17:19] <- "Impôt sur le capital"
# Combine content of the first 2 rows into the 2nd row
impots[2, ] <- apply(impots[1:2, ], 2, function(x) paste(ifelse(is.na(x[1]), x[2], ifelse(is.na(x[2]), x[1], paste(x[1], x[2], sep = " ")))))
# Remove 1st row
impots <- impots[-1, ]
# Assign the text to the 1st row and 1st column
impots[1, 1] <- "Coefficient d'impôt en %"
# Replace column names with the content of the first row
colnames(impots) <- impots[1, ]
impots <- impots[-1, ]
# Check for missing values in impots
any_missing <- any(is.na(impots))
if (any_missing) {
print("There are missing values in impots.")
} else {
print("There are no missing values in impots.")
}
#> [1] "There are no missing values in impots."
# Replace row names with the content of the 3rd column
row.names(impots) <- impots[, 3]
impots <- impots[, -3]
# Remove 2nd column (to avoid canton column)
impots <- impots[, -2]
# Remove impot eglise
impots <- impots[, -c(4:6)]
impots <- impots[, -c(6:8)]
impots <- impots[, -8]
impots <- impots[, -10]
# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))
# Replace NA values with 0
cleaned_impots[is.na(cleaned_impots)] <- 0
# Check for non-numeric values
non_numeric <- sum(!is.na(cleaned_impots) & !is.numeric(cleaned_impots))
if (non_numeric > 0) {
print(paste("Warning: Found", non_numeric, "non-numeric values."))
}
rownames(cleaned_impots) <- rownames(impots)
#reactable(head(cleaned_impots, 100))Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
# il faudra changer le path
commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
# We keep only 2019 to have some reference? (2020 is apparently not really complete)
commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
# delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
commune_2019 <- subset(commune_2019, STATUS == "A") %>%
select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
# on enlève les lignes qui sont des aggrégats
commune_2019 <- subset(commune_2019, REGION != "Schweiz")
commune_2019 <- commune_2019 %>%
pivot_wider(names_from = INDICATORS, values_from = VALUE)
# Rename columns using the provided map
commune <- commune_2019 %>%
rename(`Population - Habitants` = Ind_01_01,
`Population - Densité de la population` = Ind_01_03,
`Population - Etrangers` = Ind_01_08,
`Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
`Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
`Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
`Population - Taux brut de nuptialité` = Ind_01_09,
`Population - Taux brut de divortialité` = Ind_01_10,
`Population - Taux brut de natalité` = Ind_01_11,
`Population - Taux brut de mortalité` = Ind_01_12,
`Population - Ménages privés` = Ind_01_13,
`Population - Taille moyenne des ménages` = Ind_01_14,
`Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
`Conseil national - PLR` = Ind_14_01,
`Conseil national - PDC` = Ind_14_02,
`Conseil national - PS` = Ind_14_03,
`Conseil national - UDC` = Ind_14_04,
`Conseil national - PEV/PCS` = Ind_14_05,
`Conseil national - PVL` = Ind_14_06,
`Conseil national - PBD` = Ind_14_07,
`Conseil national - PST/Sol.` = Ind_14_08,
`Conseil national - PES` = Ind_14_09,
`Conseil national - Petits partis de droite` = Ind_14_10)
# If no one voted for a party, set as NA -> replacing it with 0 instead
commune <- commune %>%
mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
# Removing NAs from Taux de couverture sociale column
# Setting the mean as the mean for Switzerland in 2019 (3.2%)
mean_taux_aide_social <- 3.2
# Replace NA values with the mean
commune <- commune %>%
mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
#show 100 first rows of commune using reactable
reactable(head(commune, 100))
# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)
#
# # We keep only 2019 to have some reference? (2020 is apparently not really complete)
# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))
#
# # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne
# commune_2019 <- subset(commune_2019, STATUS == "A") %>%
# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))
#
# # on enlève les lignes qui sont des aggrégats
# commune_2019 <- subset(commune_2019, REGION != "Schweiz")
#
# commune_2019 <- commune_2019 %>%
# pivot_wider(names_from = INDICATORS, values_from = VALUE)
#
# # Rename columns using the provided map
# commune <- commune_2019 %>%
# rename(`Population - Habitants` = Ind_01_01,
# `Population - Densité de la population` = Ind_01_03,
# `Population - Etrangers` = Ind_01_08,
# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,
# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,
# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,
# `Population - Taux brut de nuptialité` = Ind_01_09,
# `Population - Taux brut de divortialité` = Ind_01_10,
# `Population - Taux brut de natalité` = Ind_01_11,
# `Population - Taux brut de mortalité` = Ind_01_12,
# `Population - Ménages privés` = Ind_01_13,
# `Population - Taille moyenne des ménages` = Ind_01_14,
# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,
# `Conseil national - PLR` = Ind_14_01,
# `Conseil national - PDC` = Ind_14_02,
# `Conseil national - PS` = Ind_14_03,
# `Conseil national - UDC` = Ind_14_04,
# `Conseil national - PEV/PCS` = Ind_14_05,
# `Conseil national - PVL` = Ind_14_06,
# `Conseil national - PBD` = Ind_14_07,
# `Conseil national - PST/Sol.` = Ind_14_08,
# `Conseil national - PES` = Ind_14_09,
# `Conseil national - Petits partis de droite` = Ind_14_10)
#
# # If no one voted for a party, set as NA -> replacing it with 0 instead
# commune <- commune %>%
# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))
#
#
# # Removing NAs from Taux de couverture sociale column
# # Setting the mean as the mean for Switzerland in 2019 (3.2%)
# mean_taux_aide_social <- 3.2
#
# # Replace NA values with the mean
# commune <- commune %>%
# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))
# # Create a leaflet map with optimized markers
map <- leaflet(properties_filtered) %>%
addTiles() %>% # Add default OpenStreetMap tiles
addProviderTiles(providers$Esri.NatGeoWorldMap) %>% # Add topographic maps for context
addCircleMarkers(
~lon, ~lat,
radius = 1.5, # Smaller radius for the circle markers
color = "#32012F", # Specifying a color for the markers
fillOpacity = 0.2, # Semi-transparent fill
stroke = FALSE, # No border to the circle markers to reduce visual noise
popup = ~paste("Price: ", price, "<br>",
"Rooms: ", number_of_rooms, "<br>",
"Type: ", property_type, "<br>",
"Year: ", year_category),
label = ~paste("Price: ", price) # Tooltip on hover
) %>% addLegend(
position = "bottomright", # Position the legend at the bottom right
colors = "#32012F", # Use the same color as the markers
labels = "Properties" # Label for the legend
)
map$width <- "100%" # Set the width of the map to 100%
map$height <- 600 # Set the height of the map to 600 pixels
maphistogram_price <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "red") +
labs(title = "Distribution of Prices",
x = "Price",
y = "Frequency") +
theme_minimal()
# Convert ggplot object to plotly object
interactive_histogram_price <- ggplotly(histogram_price, width = 600, height = 400 )
# Display the interactive histogram
interactive_histogram_pricenote : only price between 0 and 500000 so some outliers aren’t here
# Create the ggplot object
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ property_type, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Property Type",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
# Create a histogram of prices for each year category
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ year_category, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Year Category",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object
interactive_histogram_year <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram_yearnote : only price between 0 and 500000 so some outliers aren’t here
histogram <- ggplot(properties_filtered, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ canton, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Canton",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000) %>%
layout(height = 1000) # Adjust the height as needed
# Display the interactive plot
interactive_histogramnote : only price between 0 and 500000 so some outliers aren’t here
and the graph below only show apartments with less than 10 rooms (but you can change the code if needed
properties_room <- properties_filtered[properties_filtered$number_of_rooms < 20, ] # Filter only number_of_rooms less than 20
# Create a histogram of prices for each number of rooms
histogram <- ggplot(properties_room, aes(x = price)) +
geom_histogram(binwidth = 100000, fill = "skyblue", color = "black") +
facet_wrap(~ number_of_rooms, scales = "free", ncol = 2) +
labs(title = "Distribution of Prices by Number of Rooms",
x = "Price",
y = "Frequency") +
theme_minimal() +
xlim(0, 5000000)
# Convert ggplot object to plotly object with adjusted height
interactive_histogram <- ggplotly(histogram, width = 600, height = 1000)
# Display the interactive plot
interactive_histogram# colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)] <- gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) - 3):ncol(properties_filtered)])
#
# # Create a scatter plot to visualize correlation between price and Impôt cantonal
# scatter_plot <- ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +
# geom_point() +
# labs(title = "Correlation between Price and Impôt cantonal",
# x = "Price",
# y = "Impôt cantonal") +
# theme_minimal()
#
# # Convert ggplot object to plotly object
# interactive_plot <- ggplotly(scatter_plot)
#
# # Display the interactive plot
# interactive_plotimpot_cols <- names(properties_filtered)[startsWith(names(properties_filtered), "Impôt")]
# Count the number of NA values in selected columns
na_counts <- colSums(is.na(properties_filtered[impot_cols]))
# Print the counts
print(na_counts)
#> numeric(0)Trying to Cluster commune datas to: 1. Reduce dimension 2. See similarities
A regarder, est-ce qu’on fait un cluster pour les datas politques + un cluster pour les data démographiques, ou est-ce qu’on regroupe tout?
set.seed(123)
# Clustering demographic
cols_commune_demographic <- select(commune, -c("REGION", "CODE_REGION","Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))
# Scale the columns, some are total numbers, some are percentages
cols_commune_demographic <- scale(cols_commune_demographic)
# Calculate the distance matrix
dist_matrix_demographic <- dist(cols_commune_demographic, method = "minkowski")
# Perform hierarchical clustering
hclust_model_demographic <- hclust(dist_matrix_demographic, method = "ward.D")
# Create dendrogram
dend_demo <- as.dendrogram(hclust_model_demographic)
dend_demo <- color_branches(dend_demo, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables
plot(dend_demo, main = "Demographics - Hierarchical Clustering Dendrogram")# Clustering politics
set.seed(123)
cols_commune_politics <- select(commune, c("Conseil national - PLR","Conseil national - PDC", "Conseil national - PS", "Conseil national - UDC", "Conseil national - PEV/PCS", "Conseil national - PVL", "Conseil national - PBD", "Conseil national - PST/Sol.", "Conseil national - PES", "Conseil national - Petits partis de droite"))
# Scale the columns, some are total numbers, some are percentages
cols_commune_politics <- scale(cols_commune_politics)
# Calculate the distance matrix
dist_matrix_politics <- dist(cols_commune_politics, method = "minkowski")
# Perform hierarchical clustering
hclust_model_politics <- hclust(dist_matrix_politics, method = "ward.D")
# Create dendrogram
dend_pol <- as.dendrogram(hclust_model_politics)
dend_pol <- color_branches(dend_pol, k = 5) #Set number of cluster to 5, to keep the same scale for all our variables
plot(dend_pol, main = "Politics - Hierarchical Clustering Dendrogram")set.seed(123)
# Clean data and convert to numeric
cleaned_impots <- apply(impots, 2, function(x) as.numeric(gsub("[^0-9.-]", "", x)))
cleaned_impots[is.na(cleaned_impots)] <- 0 # Replace NA values with 0
# Scale the features
scaled_impots <- scale(cleaned_impots)
# Perform k-means clustering
k <- 2 # Initial guess for the number of clusters
kmeans_model <- kmeans(scaled_impots, centers = k)
# Check within-cluster sum of squares (elbow method)
wss <- numeric(10)
for (i in 1:10) {
kmeans_model <- kmeans(scaled_impots, centers = i)
wss[i] <- sum(kmeans_model$withinss)
}
# plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")
# Adjust k based on elbow method
k <- 5
# Perform k-means clustering again with optimal k
kmeans_model <- kmeans(scaled_impots, centers = k)
# Assign cluster labels to dendrogram
clusters <- kmeans_model$cluster
# Plot dendrogram
#colored_dend <- color_branches(dend, k = 5)
#y_zoom_range <- c(0, 80) # Adjust the y-axis range as needed
#plot(colored_dend, main = "Hierarchical Clustering Dendrogram", horiz = FALSE, ylim = y_zoom_range)# Get the cluster centers
cluster_centers <- kmeans_model$centers
# Create a data frame with cluster centers
cluster_centers_df <- data.frame(cluster = 1:k, cluster_centers)
# Print cluster centers
print(cluster_centers_df)
#> cluster Coefficient.d.impôt.en.. Impôt.sur.le.revenu.Canton
#> 1 1 0.400 -0.611
#> 2 2 0.786 -0.390
#> 3 3 -0.318 -0.266
#> 4 4 -0.792 -0.699
#> 5 5 -0.941 1.932
#> Impôt.sur.le.revenu.Commune Impôt.sur.la.fortune.Canton
#> 1 -0.0839 -0.615
#> 2 -0.5732 -0.394
#> 3 0.9387 -0.270
#> 4 -0.3119 -0.690
#> 5 1.3332 1.933
#> Impôt.sur.la.fortune.Commune Impôt.sur.le.bénéfice.Canton
#> 1 -0.0849 1.951
#> 2 -0.5725 -0.249
#> 3 0.9383 -0.869
#> 4 -0.3128 -0.709
#> 5 1.3330 1.402
#> Impôt.sur.le.bénéfice.Commune Impôt.sur.le.capital.Canton
#> 1 -1.3419 1.879
#> 2 -0.6444 -0.276
#> 3 1.9896 -0.874
#> 4 0.0565 -0.718
#> 5 0.8732 1.492
#> Impôt.sur.le.capital.Commune
#> 1 -1.30508
#> 2 -0.65169
#> 3 1.81595
#> 4 0.00514
#> 5 1.01512
# Calculate the size of each cluster
cluster_sizes <- table(kmeans_model$cluster)
# Print cluster sizes
print(cluster_sizes)
#>
#> 1 2 3 4 5
#> 75 999 178 462 417
# Get the cluster labels
cluster_labels <- kmeans_model$cluster
# Convert cleaned_impots to a data frame
impots_cluster <- as.data.frame(cleaned_impots)
# Add the cluster labels to cleaned_impots
impots_cluster$cluster <- cluster_labels
rownames(impots_cluster) <- rownames(impots)
impots_cluster <- impots_cluster %>%
rownames_to_column(var = "Community")# Preparing df_commune for merging with main dataset
df_commune <- select(commune, REGION)
df_commune$Demographic_cluster <- cutree(hclust_model_demographic, k = 5)
df_commune$Political_cluster <- cutree(hclust_model_politics, k = 5)
# Preparing to merge
merging <- inner_join(amto_df, df_commune, by = c("Community" = "REGION"))
impots_cluster_subset <- impots_cluster[, c("Community", "cluster")]
merging <- merging %>%
left_join(impots_cluster_subset, by = "Community")
clusters_df <- merging %>%
rename(Tax_cluster = cluster) %>%
rename(Commune = Community)
clusters_df <- clusters_df %>%
select(c("Commune", "zip_code", "Canton_code", "Demographic_cluster", "Political_cluster", "Tax_cluster"))
# Only NAs are for commune Brugg, (written Brugg (AG) in the other data set) -> j'entre le cluster à la mano
clusters_df$Tax_cluster[is.na(clusters_df$Tax_cluster)] <- 2
# adding it to our main data set:
properties_filtered <- merge(properties_filtered, clusters_df[, c("zip_code", "Demographic_cluster", "Political_cluster", "Tax_cluster")], by = "zip_code", all.x = TRUE)
na_count <- sum(is.na(properties_filtered[, c("Demographic_cluster", "Political_cluster", "Tax_cluster")]))
# Print the result
if (na_count > 0) {
print("There are NA values in the merged dataframe.")
print(na_count)
} else {
print("There are no NA values in the merged dataframe.")
}
#> [1] "There are NA values in the merged dataframe."
#> [1] 678
# Find rows with NA values in the specified columns
na_rows <- subset(properties_filtered, is.na(Demographic_cluster) | is.na(Political_cluster) | is.na(Tax_cluster))This section provides a comprehensive outline of the linear regression model and analysis methods employed in the study of property price determinants.
The study was conducted using R, a programming language and environment widely recognized for its robust capabilities in statistical computing and graphics. Key packages used include:
corrplot for visualization of correlation matrices, aiding in preliminary feature selection. car for diagnostic testing and variance inflation factor (VIF) analysis to detect multicollinearity among predictors.caret for creating training and testing sets, and managing cross-validation processes.ggplot2 and plotly for creating visual representations of model diagnostics, predictions, and residuals.gtsummary for creating publication-ready tables summarizing regression analysis results.Each of these tools was chosen for its specific functionality that aids in robust data analysis, ensuring that each step of the model building and evaluation process is well-supported.
Initially, a correlation analysis was conducted to identify and visualize linear relationships between the property prices and potential predictive variables such as the number of rooms and square meters. The correlation matrix was computed and plotted using the corrplot package:
correlation_matrix <- cor(properties_filtered[ , c("price", "number_of_rooms", "square_meters")], use="complete.obs")
corrplot(correlation_matrix, method="square", type="upper", tl.col="black", tl.srt=45, tl.cex=0.8, cl.cex=0.8, addCoef.col="black", number.cex=0.8, order="hclust", hclust.method="complete", tl.pos="lt", tl.offset=0.5, cl.pos="n", cl.length=1.5)This step helped in highlighting significant predictors for the regression model.
To ensure the stability of the regression model, a multicollinearity check was performed using the Variance Inflation Factor (VIF), calculated through the car package. High VIF values would indicate predictors that have strong linear relationships with other predictors, thus potentially destabilizing the model: ::: {.cell layout-align=“center”}
## Multicollinearity Check
model_for_vif <- lm(price ~ number_of_rooms + square_meters + canton + floor + year_category , data=properties_filtered)
vif_values <- vif(model_for_vif)
#show the result in the html
kable(vif_values, format = "html") %>%
kable_styling(full_width = F)| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| number_of_rooms | 1.14 | 1 | 1.07 |
| square_meters | 1.40 | 1 | 1.18 |
| canton | 1.27 | 25 | 1.00 |
| floor | 1.51 | 2 | 1.11 |
| year_category | 1.32 | 10 | 1.01 |
:::
The dataset was split into training and testing sets to validate the model’s performance. The linear regression model was then fitted using selected predictors: ::: {.cell layout-align=“center”}
# Model Building
## Split the Data
set.seed(123) # for reproducibility
trainIndex <- createDataPartition(properties_filtered$price, p=0.8, list=FALSE)
trainData <- properties_filtered[trainIndex, ]
testData <- properties_filtered[-trainIndex, ]
## Fit the Model
lm_model <- lm(price ~ number_of_rooms + square_meters + property_type + floor + year_category , data=trainData):::
Diagnostic checks such as residual analysis and normality tests were conducted to validate model assumptions. Performance metrics including R-squared and RMSE were calculated to assess the model’s explanatory power and prediction accuracy. ::: {.cell layout-align=“center”}
# Model Evaluation
## Diagnostic Checks
plot(lm_model)
ad.test(residuals(lm_model))
#>
#> Anderson-Darling normality test
#>
#> data: residuals(lm_model)
#> A = 1185, p-value <2e-16
#use gt summary to show the result
tbl_reg_1 <- gtsummary::tbl_regression(lm_model)
tbl_reg_1| Characteristic | Beta | 95% CI1 | p-value |
|---|---|---|---|
| number_of_rooms | -4,412 | -5,171, -3,652 | <0.001 |
| square_meters | 9,514 | 9,341, 9,687 | <0.001 |
| property_type | |||
| Apartment | — | — | |
| Attic flat | 123,389 | 42,733, 204,046 | 0.003 |
| Bifamiliar house | -180,630 | -255,518, -105,742 | <0.001 |
| Chalet | 111,186 | 17,273, 205,098 | 0.020 |
| Duplex | -67,817 | -166,299, 30,665 | 0.2 |
| Farm house | -723,754 | -935,634, -511,875 | <0.001 |
| Loft | -230,757 | -704,831, 243,317 | 0.3 |
| Roof flat | -29,263 | -143,286, 84,761 | 0.6 |
| Rustic house | -178,693 | -615,345, 257,959 | 0.4 |
| Single house | -171,465 | -215,243, -127,687 | <0.001 |
| Terrace flat | 20,661 | -132,447, 173,769 | 0.8 |
| Villa | 100,792 | 30,364, 171,219 | 0.005 |
| floor | |||
| floor | — | — | |
| eg | 16,885 | -28,844, 62,615 | 0.5 |
| noteg | |||
| year_category | |||
| 0-1919 | — | — | |
| 1919-1945 | 185,794 | 80,670, 290,918 | <0.001 |
| 1946-1960 | 273,863 | 174,947, 372,779 | <0.001 |
| 1961-1970 | 261,688 | 177,731, 345,644 | <0.001 |
| 1971-1980 | 309,697 | 234,861, 384,532 | <0.001 |
| 1981-1990 | 289,655 | 214,233, 365,076 | <0.001 |
| 1991-2000 | 350,078 | 271,632, 428,523 | <0.001 |
| 2001-2005 | 455,963 | 360,573, 551,353 | <0.001 |
| 2006-2010 | 496,521 | 412,785, 580,256 | <0.001 |
| 2011-2015 | 580,291 | 498,687, 661,896 | <0.001 |
| 2016-2024 | 421,002 | 357,726, 484,277 | <0.001 |
| 1 CI = Confidence Interval | |||
:::
Stepwise regression was performed to refine the model and improve its predictive performance. The resulting model was evaluated using the same diagnostic checks and performance metrics as the initial model:
# stepwise regression
lm_model2 <- step(lm_model)
#> Start: AIC=458951
#> price ~ number_of_rooms + square_meters + property_type + floor +
#> year_category
#>
#> Df Sum of Sq RSS AIC
#> - floor 1 4.89e+11 1.55e+16 458950
#> <none> 1.55e+16 458951
#> - property_type 10 1.24e+14 1.56e+16 459064
#> - number_of_rooms 1 1.21e+14 1.56e+16 459079
#> - year_category 10 2.58e+14 1.58e+16 459206
#> - square_meters 1 1.09e+16 2.64e+16 467784
#>
#> Step: AIC=458950
#> price ~ number_of_rooms + square_meters + property_type + year_category
#>
#> Df Sum of Sq RSS AIC
#> <none> 1.55e+16 458950
#> - number_of_rooms 1 1.21e+14 1.56e+16 459077
#> - property_type 11 1.65e+14 1.57e+16 459104
#> - year_category 10 2.60e+14 1.58e+16 459207
#> - square_meters 1 1.09e+16 2.64e+16 467782
plot(lm_model2)
ad.test(residuals(lm_model2))
#>
#> Anderson-Darling normality test
#>
#> data: residuals(lm_model2)
#> A = 1185, p-value <2e-16
## Performance Metrics
print(summary(lm_model2)$r.squared)
#> [1] 0.488
print(summary(lm_model2)$adj.r.squared)
#> [1] 0.487
print(rmse(testData$price, predict(lm_model2, newdata=testData)))
#> [1] 961793
print(mae(testData$price, predict(lm_model2, newdata=testData)))
#> [1] 490294
#use gt summary to show the result
tbl_reg_2 <- gtsummary::tbl_regression(lm_model2)
tbl_reg_3 <- tbl_merge(
tbls= list(tbl_reg_1, tbl_reg_2),
tab_spanner = c("**Model 1**", "**Mode Stepwise**")
)
tbl_reg_3| Characteristic | Model 1 | Mode Stepwise | ||||
|---|---|---|---|---|---|---|
| Beta | 95% CI1 | p-value | Beta | 95% CI1 | p-value | |
| number_of_rooms | -4,412 | -5,171, -3,652 | <0.001 | -4,414 | -5,173, -3,655 | <0.001 |
| square_meters | 9,514 | 9,341, 9,687 | <0.001 | 9,514 | 9,341, 9,687 | <0.001 |
| property_type | ||||||
| Apartment | — | — | — | — | ||
| Attic flat | 123,389 | 42,733, 204,046 | 0.003 | 119,562 | 39,576, 199,548 | 0.003 |
| Bifamiliar house | -180,630 | -255,518, -105,742 | <0.001 | -184,549 | -258,680, -110,419 | <0.001 |
| Chalet | 111,186 | 17,273, 205,098 | 0.020 | 107,539 | 14,148, 200,930 | 0.024 |
| Duplex | -67,817 | -166,299, 30,665 | 0.2 | -67,943 | -166,423, 30,537 | 0.2 |
| Farm house | -723,754 | -935,634, -511,875 | <0.001 | -727,087 | -938,771, -515,402 | <0.001 |
| Loft | -230,757 | -704,831, 243,317 | 0.3 | -230,627 | -704,694, 243,439 | 0.3 |
| Roof flat | -29,263 | -143,286, 84,761 | 0.6 | -33,134 | -146,672, 80,405 | 0.6 |
| Rustic house | -178,693 | -615,345, 257,959 | 0.4 | -181,988 | -618,543, 254,566 | 0.4 |
| Single house | -171,465 | -215,243, -127,687 | <0.001 | -174,995 | -217,716, -132,274 | <0.001 |
| Terrace flat | 20,661 | -132,447, 173,769 | 0.8 | 22,024 | -131,038, 175,085 | 0.8 |
| Villa | 100,792 | 30,364, 171,219 | 0.005 | 97,151 | 27,419, 166,884 | 0.006 |
| floor | ||||||
| floor | — | — | ||||
| eg | 16,885 | -28,844, 62,615 | 0.5 | |||
| noteg | ||||||
| year_category | ||||||
| 0-1919 | — | — | — | — | ||
| 1919-1945 | 185,794 | 80,670, 290,918 | <0.001 | 186,120 | 81,001, 291,238 | <0.001 |
| 1946-1960 | 273,863 | 174,947, 372,779 | <0.001 | 273,911 | 174,997, 372,826 | <0.001 |
| 1961-1970 | 261,688 | 177,731, 345,644 | <0.001 | 261,482 | 177,529, 345,435 | <0.001 |
| 1971-1980 | 309,697 | 234,861, 384,532 | <0.001 | 309,148 | 234,329, 383,968 | <0.001 |
| 1981-1990 | 289,655 | 214,233, 365,076 | <0.001 | 289,706 | 214,286, 365,127 | <0.001 |
| 1991-2000 | 350,078 | 271,632, 428,523 | <0.001 | 350,343 | 271,903, 428,784 | <0.001 |
| 2001-2005 | 455,963 | 360,573, 551,353 | <0.001 | 456,454 | 361,075, 551,834 | <0.001 |
| 2006-2010 | 496,521 | 412,785, 580,256 | <0.001 | 496,845 | 413,116, 580,575 | <0.001 |
| 2011-2015 | 580,291 | 498,687, 661,896 | <0.001 | 580,723 | 499,128, 662,318 | <0.001 |
| 2016-2024 | 421,002 | 357,726, 484,277 | <0.001 | 422,396 | 359,234, 485,558 | <0.001 |
| 1 CI = Confidence Interval | ||||||
Cross-validation was performed to enhance the robustness of the model, using the caret package to manage this process efficiently:
## Cross-Validation
cv_results <- train(price ~ number_of_rooms + square_meters + year_category + property_type, data=trainData, method="lm", trControl=trainControl(method="cv", number=10))
summary(cv_results)
#>
#> Call:
#> lm(formula = .outcome ~ ., data = dat)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -12191041 -343578 -108534 206072 16137015
#>
#> Coefficients:
#> Estimate Std. Error t value
#> (Intercept) -303862.9 34945.9 -8.70
#> number_of_rooms -4413.9 387.4 -11.39
#> square_meters 9513.9 88.2 107.87
#> `year_category1919-1945` 186119.6 53629.1 3.47
#> `year_category1946-1960` 273911.2 50463.8 5.43
#> `year_category1961-1970` 261482.1 42831.0 6.10
#> `year_category1971-1980` 309148.3 38171.1 8.10
#> `year_category1981-1990` 289706.3 38477.8 7.53
#> `year_category1991-2000` 350343.4 40018.6 8.75
#> `year_category2001-2005` 456454.4 48660.2 9.38
#> `year_category2006-2010` 496845.3 42716.8 11.63
#> `year_category2011-2015` 580723.0 41627.8 13.95
#> `year_category2016-2024` 422395.6 32223.8 13.11
#> `property_typeAttic flat` 119561.7 40806.9 2.93
#> `property_typeBifamiliar house` -184549.5 37819.9 -4.88
#> property_typeChalet 107539.1 47645.7 2.26
#> property_typeDuplex -67942.8 50242.1 -1.35
#> `property_typeFarm house` -727086.9 107996.4 -6.73
#> property_typeLoft -230627.4 241857.6 -0.95
#> `property_typeRoof flat` -33133.5 57924.7 -0.57
#> `property_typeRustic house` -181988.2 222719.8 -0.82
#> `property_typeSingle house` -174995.3 21795.3 -8.03
#> `property_typeTerrace flat` 22023.8 78088.3 0.28
#> property_typeVilla 97151.1 35575.9 2.73
#> Pr(>|t|)
#> (Intercept) < 2e-16 ***
#> number_of_rooms < 2e-16 ***
#> square_meters < 2e-16 ***
#> `year_category1919-1945` 0.00052 ***
#> `year_category1946-1960` 5.8e-08 ***
#> `year_category1961-1970` 1.1e-09 ***
#> `year_category1971-1980` 5.9e-16 ***
#> `year_category1981-1990` 5.4e-14 ***
#> `year_category1991-2000` < 2e-16 ***
#> `year_category2001-2005` < 2e-16 ***
#> `year_category2006-2010` < 2e-16 ***
#> `year_category2011-2015` < 2e-16 ***
#> `year_category2016-2024` < 2e-16 ***
#> `property_typeAttic flat` 0.00339 **
#> `property_typeBifamiliar house` 1.1e-06 ***
#> property_typeChalet 0.02402 *
#> property_typeDuplex 0.17630
#> `property_typeFarm house` 1.7e-11 ***
#> property_typeLoft 0.34032
#> `property_typeRoof flat` 0.56732
#> `property_typeRustic house` 0.41387
#> `property_typeSingle house` 1.0e-15 ***
#> `property_typeTerrace flat` 0.77792
#> property_typeVilla 0.00632 **
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 966000 on 16627 degrees of freedom
#> Multiple R-squared: 0.488, Adjusted R-squared: 0.487
#> F-statistic: 690 on 23 and 16627 DF, p-value: <2e-16
#show the CV result in the html
kable(cv_results$results, format = "html") %>%
kable_styling(full_width = F)| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 965585 | 0.488 | 505323 | 75728 | 0.048 | 17774 |
The final model was tested using the unseen test dataset to evaluate its predictive accuracy. Residual plots and actual vs. predicted price plots were created to visually assess model performance:
# Model Testing
## Test the Model
predicted_prices <- predict(lm_model2, newdata=testData)
testData$predicted_prices <- predicted_prices # Add to testData to ensure alignment
# Calculate residuals
testData$test_residuals <- testData$price - predicted_prices # Manually compute residuals
# Residual Analysis
gg <- ggplot(data = testData, aes(x = predicted_prices, y = test_residuals)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Residuals vs Predicted Prices", x = "Predicted Prices", y = "Residuals")
# Convert ggplot to plotly
p <- ggplotly(gg, width = 600, height = 400)
# Show the interactive plot
p## Visualization
gg <- ggplot(data=testData, aes(x=predicted_prices, y=price)) +
geom_point() +
geom_smooth(method="lm", col="blue") +
labs(title="Actual vs Predicted Prices", x="Predicted Prices", y="Actual Prices")
# Convert ggplot to plotly
p <- ggplotly(gg, width = 600, height = 400)
# Show the interactive plot
p#filter properties_filtered based on the 10th percentile and 90th percentile of the data
properties_filtered <- properties_filtered %>%
filter(price >= quantile(price, 0.1) & price <= quantile(price, 0.9))correlation_matrix <- cor(properties_filtered[ , c("price", "number_of_rooms", "square_meters")], use="complete.obs")
corrplot(correlation_matrix, method="square", type="upper", tl.col="black", tl.srt=45, tl.cex=0.8, cl.cex=0.8, addCoef.col="black", number.cex=0.8, order="hclust", hclust.method="complete", tl.pos="lt", tl.offset=0.5, cl.pos="n", cl.length=1.5)## Multicollinearity Check
model_for_vif <- lm(price ~ number_of_rooms + square_meters + canton + floor + year_category , data=properties_filtered)
vif_values <- vif(model_for_vif)
#show the result in the html
kable(vif_values, format = "html") %>%
kable_styling(full_width = F)| GVIF | Df | GVIF^(1/(2*Df)) | |
|---|---|---|---|
| number_of_rooms | 1.14 | 1 | 1.07 |
| square_meters | 1.47 | 1 | 1.21 |
| canton | 1.26 | 25 | 1.00 |
| floor | 1.57 | 2 | 1.12 |
| year_category | 1.38 | 10 | 1.02 |
# Model Building
## Split the Data
set.seed(123) # for reproducibility
trainIndex <- createDataPartition(properties_filtered$price, p=0.8, list=FALSE)
trainData <- properties_filtered[trainIndex, ]
testData <- properties_filtered[-trainIndex, ]
## Fit the Model
lm_model <- lm(price ~ number_of_rooms + square_meters + property_type + floor + year_category , data=trainData)# Model Evaluation
## Diagnostic Checks
plot(lm_model)
ad.test(residuals(lm_model))
#>
#> Anderson-Darling normality test
#>
#> data: residuals(lm_model)
#> A = 269, p-value <2e-16
#use gt summary to show the result
tbl_reg_1 <- gtsummary::tbl_regression(lm_model)
tbl_reg_1| Characteristic | Beta | 95% CI1 | p-value |
|---|---|---|---|
| number_of_rooms | 562 | 186, 939 | 0.003 |
| square_meters | 3,292 | 3,179, 3,404 | <0.001 |
| property_type | |||
| Apartment | — | — | |
| Attic flat | 132,991 | 98,497, 167,485 | <0.001 |
| Bifamiliar house | 140,901 | 109,557, 172,246 | <0.001 |
| Chalet | 134,968 | 88,857, 181,079 | <0.001 |
| Duplex | 48,255 | 7,076, 89,434 | 0.022 |
| Farm house | 100,683 | -2,282, 203,649 | 0.055 |
| Loft | -33,310 | -243,716, 177,097 | 0.8 |
| Roof flat | 14,092 | -36,083, 64,268 | 0.6 |
| Rustic house | -60,425 | -370,610, 249,760 | 0.7 |
| Single house | 89,394 | 69,382, 109,405 | <0.001 |
| Terrace flat | 97,389 | 32,432, 162,346 | 0.003 |
| Villa | 145,526 | 111,979, 179,072 | <0.001 |
| floor | |||
| floor | — | — | |
| eg | 30,959 | 10,648, 51,270 | 0.003 |
| noteg | |||
| year_category | |||
| 0-1919 | — | — | |
| 1919-1945 | 41,679 | -8,703, 92,061 | 0.10 |
| 1946-1960 | 62,224 | 15,770, 108,678 | 0.009 |
| 1961-1970 | 139,243 | 98,588, 179,898 | <0.001 |
| 1971-1980 | 160,547 | 124,516, 196,579 | <0.001 |
| 1981-1990 | 148,940 | 113,063, 184,816 | <0.001 |
| 1991-2000 | 143,285 | 106,562, 180,008 | <0.001 |
| 2001-2005 | 284,294 | 241,045, 327,543 | <0.001 |
| 2006-2010 | 338,373 | 299,579, 377,168 | <0.001 |
| 2011-2015 | 312,075 | 274,185, 349,965 | <0.001 |
| 2016-2024 | 205,849 | 175,396, 236,301 | <0.001 |
| 1 CI = Confidence Interval | |||
# stepwise regression
lm_model2 <- step(lm_model)
#> Start: AIC=343865
#> price ~ number_of_rooms + square_meters + property_type + floor +
#> year_category
#>
#> Df Sum of Sq RSS AIC
#> <none> 1.99e+15 343865
#> - number_of_rooms 1 1.28e+12 1.99e+15 343872
#> - floor 1 1.33e+12 1.99e+15 343872
#> - property_type 10 1.31e+13 2.00e+15 343933
#> - year_category 10 7.71e+13 2.07e+15 344353
#> - square_meters 1 4.93e+14 2.48e+15 346818
plot(lm_model2)
ad.test(residuals(lm_model2))
#>
#> Anderson-Darling normality test
#>
#> data: residuals(lm_model2)
#> A = 269, p-value <2e-16
## Performance Metrics
print(summary(lm_model2)$r.squared)
#> [1] 0.31
print(summary(lm_model2)$adj.r.squared)
#> [1] 0.309
print(rmse(testData$price, predict(lm_model2, newdata=testData)))
#> [1] 382817
print(mae(testData$price, predict(lm_model2, newdata=testData)))
#> [1] 298006
#use gt summary to show the result
tbl_reg_2 <- gtsummary::tbl_regression(lm_model2)
tbl_reg_3 <- tbl_merge(
tbls= list(tbl_reg_1, tbl_reg_2),
tab_spanner = c("**Model 1**", "**Mode Stepwise**")
)
tbl_reg_3| Characteristic | Model 1 | Mode Stepwise | ||||
|---|---|---|---|---|---|---|
| Beta | 95% CI1 | p-value | Beta | 95% CI1 | p-value | |
| number_of_rooms | 562 | 186, 939 | 0.003 | 562 | 186, 939 | 0.003 |
| square_meters | 3,292 | 3,179, 3,404 | <0.001 | 3,292 | 3,179, 3,404 | <0.001 |
| property_type | ||||||
| Apartment | — | — | — | — | ||
| Attic flat | 132,991 | 98,497, 167,485 | <0.001 | 132,991 | 98,497, 167,485 | <0.001 |
| Bifamiliar house | 140,901 | 109,557, 172,246 | <0.001 | 140,901 | 109,557, 172,246 | <0.001 |
| Chalet | 134,968 | 88,857, 181,079 | <0.001 | 134,968 | 88,857, 181,079 | <0.001 |
| Duplex | 48,255 | 7,076, 89,434 | 0.022 | 48,255 | 7,076, 89,434 | 0.022 |
| Farm house | 100,683 | -2,282, 203,649 | 0.055 | 100,683 | -2,282, 203,649 | 0.055 |
| Loft | -33,310 | -243,716, 177,097 | 0.8 | -33,310 | -243,716, 177,097 | 0.8 |
| Roof flat | 14,092 | -36,083, 64,268 | 0.6 | 14,092 | -36,083, 64,268 | 0.6 |
| Rustic house | -60,425 | -370,610, 249,760 | 0.7 | -60,425 | -370,610, 249,760 | 0.7 |
| Single house | 89,394 | 69,382, 109,405 | <0.001 | 89,394 | 69,382, 109,405 | <0.001 |
| Terrace flat | 97,389 | 32,432, 162,346 | 0.003 | 97,389 | 32,432, 162,346 | 0.003 |
| Villa | 145,526 | 111,979, 179,072 | <0.001 | 145,526 | 111,979, 179,072 | <0.001 |
| floor | ||||||
| floor | — | — | — | — | ||
| eg | 30,959 | 10,648, 51,270 | 0.003 | 30,959 | 10,648, 51,270 | 0.003 |
| noteg | ||||||
| year_category | ||||||
| 0-1919 | — | — | — | — | ||
| 1919-1945 | 41,679 | -8,703, 92,061 | 0.10 | 41,679 | -8,703, 92,061 | 0.10 |
| 1946-1960 | 62,224 | 15,770, 108,678 | 0.009 | 62,224 | 15,770, 108,678 | 0.009 |
| 1961-1970 | 139,243 | 98,588, 179,898 | <0.001 | 139,243 | 98,588, 179,898 | <0.001 |
| 1971-1980 | 160,547 | 124,516, 196,579 | <0.001 | 160,547 | 124,516, 196,579 | <0.001 |
| 1981-1990 | 148,940 | 113,063, 184,816 | <0.001 | 148,940 | 113,063, 184,816 | <0.001 |
| 1991-2000 | 143,285 | 106,562, 180,008 | <0.001 | 143,285 | 106,562, 180,008 | <0.001 |
| 2001-2005 | 284,294 | 241,045, 327,543 | <0.001 | 284,294 | 241,045, 327,543 | <0.001 |
| 2006-2010 | 338,373 | 299,579, 377,168 | <0.001 | 338,373 | 299,579, 377,168 | <0.001 |
| 2011-2015 | 312,075 | 274,185, 349,965 | <0.001 | 312,075 | 274,185, 349,965 | <0.001 |
| 2016-2024 | 205,849 | 175,396, 236,301 | <0.001 | 205,849 | 175,396, 236,301 | <0.001 |
| 1 CI = Confidence Interval | ||||||
## Cross-Validation
cv_results <- train(price ~ number_of_rooms + square_meters + year_category + property_type, data=trainData, method="lm", trControl=trainControl(method="cv", number=10))
summary(cv_results)
#>
#> Call:
#> lm(formula = .outcome ~ ., data = dat)
#>
#> Residuals:
#> Min 1Q Median 3Q Max
#> -3908178 -264451 -91098 202047 1588028
#>
#> Coefficients:
#> Estimate Std. Error t value Pr(>|t|)
#> (Intercept) 367183.9 17895.9 20.52 < 2e-16
#> number_of_rooms 556.9 192.2 2.90 0.0038
#> square_meters 3289.7 57.3 57.40 < 2e-16
#> `year_category1919-1945` 41801.7 25710.8 1.63 0.1040
#> `year_category1946-1960` 62133.7 23706.4 2.62 0.0088
#> `year_category1961-1970` 138669.4 20746.2 6.68 2.4e-11
#> `year_category1971-1980` 159375.1 18383.3 8.67 < 2e-16
#> `year_category1981-1990` 148814.8 18308.5 8.13 4.7e-16
#> `year_category1991-2000` 143478.5 18740.4 7.66 2.0e-14
#> `year_category2001-2005` 284998.7 22069.5 12.91 < 2e-16
#> `year_category2006-2010` 338739.4 19797.1 17.11 < 2e-16
#> `year_category2011-2015` 312960.1 19333.9 16.19 < 2e-16
#> `year_category2016-2024` 208499.4 15514.9 13.44 < 2e-16
#> `property_typeAttic flat` 126047.5 17448.9 7.22 5.3e-13
#> `property_typeBifamiliar house` 133942.7 15824.9 8.46 < 2e-16
#> property_typeChalet 128718.6 23438.0 5.49 4.0e-08
#> property_typeDuplex 48241.9 21014.4 2.30 0.0217
#> `property_typeFarm house` 95018.3 52511.1 1.81 0.0704
#> property_typeLoft -31141.6 107371.8 -0.29 0.7718
#> `property_typeRoof flat` 7104.6 25498.4 0.28 0.7805
#> `property_typeRustic house` -66301.2 158280.9 -0.42 0.6753
#> `property_typeSingle house` 83236.8 10002.0 8.32 < 2e-16
#> `property_typeTerrace flat` 100292.7 33134.5 3.03 0.0025
#> property_typeVilla 138896.8 16974.9 8.18 3.0e-16
#>
#> (Intercept) ***
#> number_of_rooms **
#> square_meters ***
#> `year_category1919-1945`
#> `year_category1946-1960` **
#> `year_category1961-1970` ***
#> `year_category1971-1980` ***
#> `year_category1981-1990` ***
#> `year_category1991-2000` ***
#> `year_category2001-2005` ***
#> `year_category2006-2010` ***
#> `year_category2011-2015` ***
#> `year_category2016-2024` ***
#> `property_typeAttic flat` ***
#> `property_typeBifamiliar house` ***
#> property_typeChalet ***
#> property_typeDuplex *
#> `property_typeFarm house` .
#> property_typeLoft
#> `property_typeRoof flat`
#> `property_typeRustic house`
#> `property_typeSingle house` ***
#> `property_typeTerrace flat` **
#> property_typeVilla ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 386000 on 13340 degrees of freedom
#> Multiple R-squared: 0.31, Adjusted R-squared: 0.308
#> F-statistic: 260 on 23 and 13340 DF, p-value: <2e-16
#show the CV result in the html
kable(cv_results$results, format = "html") %>%
kable_styling(full_width = F)| intercept | RMSE | Rsquared | MAE | RMSESD | RsquaredSD | MAESD |
|---|---|---|---|---|---|---|
| TRUE | 386801 | 0.309 | 298951 | 11109 | 0.037 | 6161 |
# Model Testing
## Test the Model
predicted_prices <- predict(lm_model2, newdata=testData)
testData$predicted_prices <- predicted_prices # Add to testData to ensure alignment
# Calculate residuals
testData$test_residuals <- testData$price - predicted_prices # Manually compute residuals
# Residual Analysis
gg <- ggplot(data = testData, aes(x = predicted_prices, y = test_residuals)) +
geom_point() +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Residuals vs Predicted Prices", x = "Predicted Prices", y = "Residuals")
# Convert ggplot to plotly
p <- ggplotly(gg, width = 600, height = 400)
# Show the interactive plot
p## Visualization
gg <- ggplot(data=testData, aes(x=predicted_prices, y=price)) +
geom_point() +
geom_smooth(method="lm", col="blue") +
labs(title="Actual vs Predicted Prices", x="Predicted Prices", y="Actual Prices")
# Convert ggplot to plotly
p <- ggplotly(gg, width = 600, height = 400)
# Show the interactive plot
p